#------------------------------------------------------#
#------------------------------------------------------#
#This file calculates the distance from each city in our dataset to one of 18 disposition
#centers, as designated by DLA.
#------------------------------------------------------#
#------------------------------------------------------#
library(geosphere) #distm function
library(dplyr)
library(readstata13)
options(scipen=0, digits=7) #to standardize number of digits when exporting data. all data here is run using these options.

#The DLA sites are from Harris et al.
FACs <- read.csv("raw-data/dlaDispSites.csv")

# load cities location data
uscities <- read.csv("raw-data/2017CensusPlaceLocations.csv") 
#------------------------------------------------------#
#------------------------------------------------------#
#CALCULATE Haverstine DISTANCES FROM CITIES --------
#------------------------------------------------------#
#------------------------------------------------------#
disttoFACs <- as.data.frame(distm(uscities[,c('INTPTLONG','INTPTLAT')], FACs[,c('Longi','Lat')], 
                                  fun=distHaversine))

##change colnames to refer to the disposition centers
colnames(disttoFACs) <- c("DLALewisMcChordWA","DLAStocktonCA","DLAPendletonCA","DLABarstowCA",
                          "DLAHillUT","DLATucsonAZ","DLAFtCarsonCO","DLAFtRileyKS",
                          "DLAFtSamHoustonTX","DLATexarkanaTX","DLAEglinFL","DLAJacksonvilleFL",
                          "DLARobinsGA","DLAFortBraggNC","DLANorfolkVA","DLAFortMeadeMD",
                          "DLAColumbusOH","DLAMechanisburgPA") 

#carry over city, state name and land area from us cities database
disttoFACs <- disttoFACs %>%
  mutate(City=uscities$NAME, State=uscities$USPS,
         landareamiles=uscities$ALAND_SQMI,landareameters=uscities$ALAND,
         closestFACs=pmin(DLALewisMcChordWA,DLAStocktonCA,DLAPendletonCA,DLABarstowCA,
                          DLAHillUT,DLATucsonAZ,DLAFtCarsonCO,DLAFtRileyKS,
                          DLAFtSamHoustonTX,DLATexarkanaTX,DLAEglinFL,DLAJacksonvilleFL,
                          DLARobinsGA,DLAFortBraggNC,DLANorfolkVA,DLAFortMeadeMD,
                          DLAColumbusOH,DLAMechanisburgPA), ##and, create a column for the distance to the closest center
         sixthclosestFACs = apply(.[1:18], 1, function(x) sort(x, decreasing=TRUE)[6]) ) #and to sixth closest center

###final instrument is whether the city is a HIDTA (designated as a high intensity drug trafficking area)
hidta <- read.csv("raw-data/HIDTAcitiesandcounties.csv") %>%
  select(-COUNTRY_CD,-PROVINCE_COUNTY_NAME,-HIDTA_COUNTY_NAME,-IS_HIFCA,-HIFCA_REGION_NAME) %>%
  distinct() #eliminate unnecessary columns from the HIDTA dataset and drop duplicates

# duplicate cities in uscities.csv causing joins to expand
# drop cities that have duplicates ----
tmp <- uscities %>% 
  group_by(NAME, USPS) %>%
  tally() %>%
  filter(n>1)

uscities <- anti_join(uscities,tmp) 
  
## some cities are duplicated but one entry IS_HIDTA=1 and one entry IS_HIDTA=0, assume IS_HIDTA=1
tmp <- hidta %>% 
  group_by(CITY, STATE_CD) %>%
  tally() %>%
  filter(n>1)

to_remove <- left_join(tmp,hidta) %>% 
  filter(IS_HIDTA==0)

hidta <- anti_join(hidta,to_remove)

hidtaanddisttoFACs <- left_join(disttoFACs,hidta,by=c("State"="STATE_CD","City"="CITY"))%>%
  replace(., is.na(.), 0)  #if IS_HIDTA is NA, it means that city is not a HIDTA

###export .csv
write.csv(hidtaanddisttoFACs,"data/cityHARRISinstruments.csv",row.names=F,na="")

#------------------------------------------------------#
#------------------------------------------------------#
#CALCULATE DISTANCES FROM COUNTIES.
#this approach calculates the Haversine distance in meters from each county to each distribution
#center. The Haversine distance is the shortest possible distance between the two points, i.e.
#as the crow flies.
#------------------------------------------------------#
#------------------------------------------------------#
#read in datafile of lat/long of us counties
uscounties <- read.csv("raw-data/2017CensusCountyLocations.csv")

#calculate distances from counties
disttoFACscounties <- as.data.frame(distm(uscounties[,c('INTPTLONG','INTPTLAT')], FACs[,c('Longi','Lat')], 
                                          fun=distHaversine))

##change colnames to refer to the disposition centers
colnames(disttoFACscounties) <- c("DLALewisMcChordWA","DLAStocktonCA","DLAPendletonCA","DLABarstowCA",
                                  "DLAHillUT","DLATucsonAZ","DLAFtCarsonCO","DLAFtRileyKS",
                                  "DLAFtSamHoustonTX","DLATexarkanaTX","DLAEglinFL","DLAJacksonvilleFL",
                                  "DLARobinsGA","DLAFortBraggNC","DLANorfolkVA","DLAFortMeadeMD",
                                  "DLAColumbusOH","DLAMechanisburgPA") 

#carry over names, land area from us counties dataset
disttoFACscounties <- disttoFACscounties %>%
  mutate(County=uscounties$NAME,State=uscounties$USPS,
         landareamiles=uscounties$ALAND_SQMI,landareameters=uscounties$ALAND,
         closestFACs=pmin(DLALewisMcChordWA,DLAStocktonCA,DLAPendletonCA,DLABarstowCA,
                          DLAHillUT,DLATucsonAZ,DLAFtCarsonCO,DLAFtRileyKS,
                          DLAFtSamHoustonTX,DLATexarkanaTX,DLAEglinFL,DLAJacksonvilleFL,
                          DLARobinsGA,DLAFortBraggNC,DLANorfolkVA,DLAFortMeadeMD,
                          DLAColumbusOH,DLAMechanisburgPA), ##and, create a column for the distance to the closest center
         sixthclosestFACs = apply(.[1:18], 1,  ##distance to sixth closest center
                                   function(x) sort(x, decreasing=TRUE)[6]))

#final instrument is whether the county has been designated a hidta
#re-load so have county info
hidta <- read.csv("raw-data/HIDTAcitiesandcounties.csv") %>%
  select(STATE_CD, HIDTA_COUNTY_NAME, IS_HIDTA) %>%
  unique() 

hidtaanddisttoFACscounties <- left_join(disttoFACscounties,hidta,
                                        by=c("State"="STATE_CD",
                                             "County"="HIDTA_COUNTY_NAME")) %>%
  replace(., is.na(.), 0)  #if IS_HIDTA is NA, it means that county is not a HIDTA

###export .csv
write.csv(hidtaanddisttoFACscounties,"data/countyHARRISinstruments.csv",row.names=F,na="")

######COMPARE OUR DISTANCES WITH HARRIS ET AL. DISTANCES 
harris <- read.dta13("raw-data/HPBM-AEJ-1033.dta") %>%
  mutate(closestFAC = (SumUSitems)/(zHUdI1)) 

#see how our values and theirs compare
summary(harris$closestFAC) 
# Min. 1st Qu.  Median    Mean    3rd Qu.    Max. 
#1.926 118.368 191.301   215.318  289.922    707.212
summary((hidtaanddisttoFACscounties$closestFACs/1609.34))
#    Min.  1st Qu.   Median     Mean  3rd Qu. Max. 
#    1.927  119.215  192.546  230.015  295.678 2548.716 